#install.packages("tidycensus")
# Load necessary libraries
library(tidycensus)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
#install.packages("factoextra")
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("ggplot2")
#install.packages("usmap")
#install.packages("maps")
library(ggplot2)
library(usmap)
library(maps)
#Abhay
#Employment - K202301 for 2021
# Get ACS data
df1 <- get_acs(geography = "state",
table = "K202301",
year = 2021,
survey = "acs1-year",
cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K202301 and caching the dataset for faster future access.
variables_df1 <- data.frame(
variable = c("K202301_001", "K202301_002", "K202301_003", "K202301_004", "K202301_005", "K202301_006", "K202301_007"),
label = c("Total", "In labor force:", "Civilian labor force:", "Employed",
"Unemployed", "In Armed Forces",
"Not in labor force")
)
# Joining the data with the variable labels to get the descriptive names
df1_labeled <- df1 %>%
left_join(variables_df1, by = "variable")
# Reshapeing the data so that state names are rows and variable labels are columns
df1_wide <- df1_labeled %>%
pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
# Calculate employment rates
df1_wide <- df1_wide %>%
mutate(
EmploymentRate = `Employed` / `Total` * 100,
UnemploymentRate = `Unemployed` / `Total` * 100,
NotInLaborForceRate = `Not in labor force` / `Total` * 100
)
# Plotting employment rates
library(ggplot2)
ggplot(df1_wide, aes(x = reorder(NAME, -EmploymentRate), y = EmploymentRate)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(x = "State", y = "Employment Rate (%)", title = "Employment Rates by State") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Plotting labor force participation
ggplot(df1_wide, aes(x = NAME)) +
geom_bar(aes(y = `Employed`, fill = "Employed"), stat = "identity") +
geom_bar(aes(y = `Unemployed`, fill = "Unemployed"), stat = "identity") +
geom_bar(aes(y = `In Armed Forces`, fill = "Armed Forces"), stat = "identity") +
scale_fill_manual(values = c("Employed" = "green", "Unemployed" = "red", "Armed Forces" = "blue")) +
theme(axis.text.x = element_text(angle = 90)) +
labs(x = "State", y = "Number of People", fill = "Category", title = "Labor Force Participation by State")
ggplot(df1_wide, aes(x = UnemploymentRate)) +
geom_histogram(binwidth = 1, fill = "tomato", color = "black") +
labs(x = "Unemployment Rate (%)", y = "Number of States", title = "Distribution of Unemployment Rates across States")
# Correlation analysis
employment_data <- df1_wide %>%
select(`Employed`, `Unemployed`, `In Armed Forces`, `Not in labor force`)
correlation_matrix <- cor(employment_data, use = "complete.obs")
# Visualize the correlation matrix
library(corrplot)
## corrplot 0.92 loaded
corrplot(correlation_matrix, method = "circle")
# Convert state names to abbreviations
state_abbreviations <- state.abb[match(df1_wide$NAME, state.name)]
df1_wide$state <- state_abbreviations
df1_wide$EmploymentRate <- df1_wide$`Employed` / df1_wide$Total * 100
# Plot the map with employment rate
plot_usmap(data = df1_wide, values = "EmploymentRate", labels = TRUE) +
scale_fill_continuous(name = "Employment Rate (%)", label = scales::percent_format()) +
theme(legend.position = "right") +
labs(title = "Employment Rate by State in 2021")
#Neha
#Education - K201501
df2 <- get_acs(geography = "state",
table = "K201501",
year = 2021,
survey = "acs1/subject",cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201501 and caching the dataset for faster future access.
variables_df2 <- data.frame(
variable = c("K201501_001", "K201501_002", "K201501_003", "K201501_004", "K201501_005", "K201501_006", "K201501_007", "K201501_008"),
label = c("Education_Total_students", "Education_Below_9th grade", "Education_9th to 12th grade_no diploma", "Education_High_school_graduate", "Education_Some college_no degree", "Education_Associates_degree", "Education_Bachelors_degree", "Education_Graduate_professional degree")
)
df2_labeled <- df2 %>%
left_join(variables_df2, by = "variable")
df2_wide <- df2_labeled %>%
pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
Exploratory Data Analysis (EDA)
# Ensure necessary libraries are installed and loaded
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("tidyr")) install.packages("tidyr")
library(ggplot2)
library(tidyr)
# Convert df2_wide to long format
df2_long <- df2_wide %>%
gather(key = "Education_Level", value = "Count", -NAME)
# Plotting the distribution of each education level across states
ggplot(df2_long, aes(x = NAME, y = Count, fill = Education_Level)) +
geom_bar(stat = "identity", position = position_dodge()) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = "Distribution of Education Levels across States",
x = "State",
y = "Count")
Exploratory Data Analysis (EDA) based on percentage
# Ensure necessary libraries are installed and loaded
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("tidyr")) install.packages("tidyr")
if (!require("dplyr")) install.packages("dplyr")
library(ggplot2)
library(tidyr)
library(dplyr)
# Convert df2_wide to long format
df2_long <- df2_wide %>%
gather(key = "Education_Level", value = "Count", -NAME)
# Calculate total count for each state
df2_long <- df2_long %>%
group_by(NAME) %>%
mutate(Total = sum(Count)) %>%
ungroup()
# Calculate the percentage
df2_long <- df2_long %>%
mutate(Percentage = (Count / Total) * 100)
# Plotting the distribution of each education level across states as a percentage
ggplot(df2_long, aes(x = NAME, y = Percentage, fill = Education_Level)) +
geom_bar(stat = "identity", position = position_dodge()) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = "Percentage Distribution of Education Levels across States",
x = "State",
y = "Percentage")
library(ggplot2)
library(tidyr)
library(dplyr)
# Convert df2_wide to long format
df2_long <- df2_wide %>%
gather(key = "Education_Level", value = "Count", -NAME)
# Calculate total count for each state only once
total_counts <- df2_long %>%
group_by(NAME) %>%
summarize(Total = sum(Count))
# Join with original data to add the Total column
df2_long <- df2_long %>%
left_join(total_counts, by = "NAME")
# Calculate the percentage for each education level
df2_long <- df2_long %>%
mutate(Percentage = (Count / Total) * 100)
# Create graphs for each educational level
unique_levels <- unique(df2_long$Education_Level)
plots <- list()
for (level in unique_levels) {
df_subset <- filter(df2_long, Education_Level == level)
p <- ggplot(df_subset, aes(x = NAME, y = Percentage, fill = Education_Level)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = paste("Percentage Distribution of", level, "across States"),
x = "State",
y = "Percentage") +
guides(fill = guide_legend(title = "Education Level"))
plots[[level]] <- p
}
# Print plots (optional)
for (plot in plots) {
print(plot)
}
df2_percentages <- df2_wide %>%
mutate(
`Education_Below_9th grade_Percentage` = `Education_Below_9th grade` / Education_Total_students * 100,
`Education_9th to 12th grade_no diploma_Percentage` = `Education_9th to 12th grade_no diploma` / Education_Total_students * 100,
Education_High_school_graduate_Percentage = Education_High_school_graduate / Education_Total_students * 100,
`Education_Some college_no degree_Percentage` = `Education_Some college_no degree` / Education_Total_students * 100,
Education_Associates_degree_Percentage = Education_Associates_degree / Education_Total_students * 100,
Education_Bachelors_degree_Percentage = Education_Bachelors_degree / Education_Total_students * 100,
`Education_Graduate_professional degree_Percentage` = `Education_Graduate_professional degree` / Education_Total_students * 100
)
percentage_columns <- grep("_Percentage$", names(df2_percentages), value = TRUE)
df2_long <- df2_percentages %>%
pivot_longer(cols = percentage_columns, names_to = "Age_Group", values_to = "Percentage")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(percentage_columns)
##
## # Now:
## data %>% select(all_of(percentage_columns))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(df2_long, aes(x = reorder(NAME, -Education_Total_students), y = Percentage, fill = Age_Group)) +
geom_bar(stat = "identity", position = "stack") +
labs(title = "Age Distribution by State",
x = "State",
y = "Percentage of Total Population",
fill = "Age Group") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
scale_fill_viridis_d() # Adjust the color scale as needed
#Esha
#Citizenship - K200501
df3<- get_acs(
geography = "state",
table = "K200501",
year = 2021,
survey = "acs1/subject",
cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200501 and caching the dataset for faster future access.
variables_df3 <- data.frame(
variable = c("K200501_001", "K200501_002","K200501_003"),
label = c("Total","U.S. citizen", "Not a U.S. citizen")
)
df3_labeled <- df3 %>%
left_join(variables_df3, by = "variable")
df3_wide <- df3_labeled %>%
pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
# Load necessary libraries
library(ggplot2)
library(tidyr)
library(dplyr)
# Convert df3_wide to long format
df3_long <- df3_wide %>%
gather(key = "Citizenship", value = "Count", -NAME)
# Calculate total count for each state
df3_long <- df3_long %>%
group_by(NAME) %>%
mutate(Total = sum(Count)) %>%
ungroup()
# Calculate the percentage
df3_long <- df3_long %>%
mutate(Percentage = (Count / Total) * 100)
# Iterate over each unique citizenship category
for (category in unique(df3_long$Citizenship)) {
# Filter data for the current citizenship category
df3_subset <- df3_long %>%
filter(Citizenship == category)
# Generate and print the plot for citizenship
p <- ggplot(df3_subset, aes(x = NAME, y = Percentage, fill = Citizenship)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = paste("Percentage Distribution of", category, "across States"),
x = "State",
y = "Percentage")
print(p)
}
## Warning: Removed 1 rows containing missing values (`position_stack()`).
## Warning: Removed 1 rows containing missing values (`position_stack()`).
## Warning: Removed 1 rows containing missing values (`position_stack()`).
#Srika
#Age - K200104
df4 <- get_acs(geography = "state",
table = "K200104",
year = 2021,
survey = "acs1/subject",cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
variables_df4 <- data.frame(
variable = c("K200104_001", "K200104_002", "K200104_003", "K200104_004", "K200104_005", "K200104_006", "K200104_007", "K200104_008"),
label = c("Total_age", "Age_under_18",
"Age_18_to_24", "Age_25_to_34",
"Age_35_to_44", "Age_45_to_54", "Age_55_to_64","Age_over_64")
)
df4_labeled <- df4 %>%
left_join(variables_df4, by = "variable")
df4_wide <- df4_labeled %>%
pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
df4_percentages <- df4_wide %>%
mutate(
Age_under_18_Percentage = Age_under_18 / Total_age * 100,
Age_18_to_24_Percentage = Age_18_to_24 / Total_age * 100,
Age_25_to_34_Percentage = Age_25_to_34 / Total_age * 100,
Age_35_to_44_Percentage = Age_35_to_44 / Total_age * 100,
Age_45_to_54_Percentage = Age_45_to_54 / Total_age * 100,
Age_55_to_64_Percentage = Age_55_to_64 / Total_age * 100,
Age_over_64_Percentage = Age_over_64 / Total_age * 100
)
percentage_columns <- grep("_Percentage$", names(df4_percentages), value = TRUE)
df4_long <- df4_percentages %>%
pivot_longer(cols = percentage_columns, names_to = "Age_Group", values_to = "Percentage")
ggplot(df4_long, aes(x = reorder(NAME, -Total_age), y = Percentage, fill = Age_Group)) +
geom_bar(stat = "identity", position = "stack") +
labs(title = "Age Distribution by State",
x = "State",
y = "Percentage of Total Population",
fill = "Age Group") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
scale_fill_viridis_d() # Adjust the color scale as needed
# Load necessary libraries if not already loaded
library(ggplot2)
library(tidyr)
# Calculate percentages for each age group
df4_percentages <- df4_wide %>%
mutate(
Age_under_18_Percentage = Age_under_18 / Total_age * 100,
Age_18_to_24_Percentage = Age_18_to_24 / Total_age * 100,
Age_25_to_34_Percentage = Age_25_to_34 / Total_age * 100,
Age_35_to_44_Percentage = Age_35_to_44 / Total_age * 100,
Age_45_to_54_Percentage = Age_45_to_54 / Total_age * 100,
Age_55_to_64_Percentage = Age_55_to_64 / Total_age * 100,
Age_over_64_Percentage = Age_over_64 / Total_age * 100
)
# Filter columns based on names ending with "Percentage"
percentage_columns <- grep("_Percentage$", names(df4_percentages), value = TRUE)
# Reshape the data to a longer format
df4_long <- df4_percentages %>%
pivot_longer(cols = percentage_columns, names_to = "Age_Group", values_to = "Percentage")
# Create a facet grid for each age group
ggplot(df4_long, aes(x = reorder(NAME, -Total_age), y = Percentage, fill = Age_Group)) +
geom_bar(stat = "identity", position = "stack") +
facet_grid(Age_Group ~ ., scales = "free_y") +
labs(title = "Age Distribution by State",
x = "State",
y = "Percentage of Total Population",
fill = "Age Group") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
scale_fill_viridis_d() # Adjust the color scale as needed
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ lubridate 1.9.2 ✔ stringr 1.5.0
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::map() masks maps::map()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
Age_data <- get_acs(geography = "state",
table = "K200104",
year = 2021,
geometry = TRUE,
resolution = "20m",
survey = "acs1/subject",cache_table = TRUE
)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## Loading ACSSE variables for 2021 from table K200104 and caching the dataset for faster future access.
##
|
| | 0%
|
|====== | 9%
|
|============ | 17%
|
|================== | 26%
|
|======================== | 35%
|
|=============================== | 44%
|
|===================================== | 52%
|
|=========================================== | 61%
|
|================================================= | 70%
|
|======================================================= | 79%
|
|============================================================= | 87%
|
|======================================================================| 100%
Mapping_age<-Age_data%>%
filter(variable=="K200104_007")%>%
shift_geometry()
ggplot(data = Mapping_age, aes(fill = estimate)) +
geom_sf()+
scale_fill_distiller(palette = "RdPu",
direction = 1) +
labs(title = "Median Age by State, 2021",
caption = "Data source: 2021 1-year ACS, US Census Bureau",
fill = "ACS estimate") +
theme_void()
#Niharika
# Housing - K202502
df5 <- get_acs(geography = "state",
table = "K202502",
year = 2021,
survey = "acs1", # removed /subject since we're referring to a specific table ID
cache_table = TRUE)
## Getting data from the 2021 1-year ACS
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K202502 and caching the dataset for faster future access.
# Define the variable codes and their respective labels for the housing dataset (df5)
variables_df5 <- data.frame(
variable = c("K202502_001", "K202502_002", "K202502_003"),
label = c("Total", "Owner Occupied", "Renter Occupied")
)
# Assuming df5 is your housing dataset
# Reshape the data so that state names are rows
df5_labeled <- df5 %>%
left_join(variables_df5, by = "variable") # Joining with the variable labels dataframe
# Reshape the data to have state names as rows and variable labels as columns
df5_wide <- df5_labeled %>%
pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
# df5_wide will have states as rows and different housing-related variables as columns
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Assuming df5_wide is your dataset for housing
# Melt the data to long format for easy plotting
df5_long <- tidyr::gather(df5_wide, key = "Housing_Category", value = "Count", -NAME)
# Calculate the percentage for Owner and Renter Occupied categories
df5_long <- df5_long %>%
group_by(NAME) %>%
mutate(Total = sum(Count[which(Housing_Category == "Total")])) %>%
ungroup() %>%
mutate(Percentage = ifelse(Housing_Category != "Total", Count / Total * 100, Count))
# Define colors for each category
category_colors <- c("Owner Occupied" = "steelblue", "Renter Occupied" = "salmon", "Total" = "lightgreen")
# Function to create a bar plot
create_bar_plot <- function(data, category, title) {
ggplot(data[data$Housing_Category == category, ], aes(x = reorder(NAME, -Percentage), y = Percentage, fill = Housing_Category)) +
geom_bar(stat = "identity", position = position_dodge()) +
labs(x = "State", y = "Percentage", title = title) +
scale_y_continuous(labels = percent_format(scale = 1)) +
scale_fill_manual(values = category_colors) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}
# Plot for Owner Occupied
owner_plot <- create_bar_plot(df5_long, "Owner Occupied", "Percentage of Owner Occupied Housing by State")
print(owner_plot)
# Plot for Renter Occupied
renter_plot <- create_bar_plot(df5_long, "Renter Occupied", "Percentage of Renter Occupied Housing by State")
print(renter_plot)
#install.packages("tidycensus")
# Load necessary libraries
library(tidycensus)
library(dplyr)
library(tidyr)
#install.packages("factoextra")
library(factoextra)
#install.packages("ggplot2")
#install.packages("usmap")
#install.packages("maps")
library(ggplot2)
library(usmap)
library(maps)
#Abhay
#Disabilities - K201803
# Get ACS data for the Disabilities dataset
df6 <- get_acs(geography = "state",
table = "K201803",
year = 2021,
survey = "acs1-year",
cache_table = TRUE)
## Getting data from the ACS 1-year Supplemental Estimates. Data are available for geographies with populations of 20,000 and greater.
## Loading ACSSE variables for 2021 from table K201803 and caching the dataset for faster future access.
# Assuming the variables_df6 mapping is similar to variables_df1 but for the Disabilities table
# You need to create variables_df6 with the correct variable codes and labels for the Disabilities data
variables_df6 <- data.frame(
variable = c("K201803_001", "K201803_002", "K201803_003", "K201803_004", "K201803_005", "K201803_006", "K201803_007","K201803_008","K201803_009"),
label = c("Total_people", "Total With Disabilities",
"Hearing", "Vision difficulty",
"cognative", "ambulatory difficulty",
"Self-care difficulty","Independent living difficulty","No Disability")
)
# Join your data with the variable labels to get the descriptive names
df6_labeled <- df6 %>%
left_join(variables_df6, by = "variable")
# Reshape the data so that state names are rows and variable labels are columns
df6_wide <- df6_labeled %>%
pivot_wider(names_from = label, values_from = estimate, id_cols = NAME)
df6_wide
## # A tibble: 52 × 10
## NAME Total_people Total With Disabilit…¹ Hearing `Vision difficulty`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Alabama 4957633 808071 208028 152798
## 2 Alaska 702154 92390 33397 15748
## 3 Arizona 7174053 972252 298849 180792
## 4 Arkansas 2974701 517051 142133 105624
## 5 California 38724294 4324355 1140131 844049
## 6 Colorado 5715497 640346 211803 120570
## 7 Connecticut 3557526 427014 113490 78078
## 8 Delaware 987964 130551 37933 25335
## 9 District of … 659979 76754 14429 14569
## 10 Florida 21465883 2906367 812248 555361
## # ℹ 42 more rows
## # ℹ abbreviated name: ¹​`Total With Disabilities`
## # ℹ 5 more variables: cognative <dbl>, `ambulatory difficulty` <dbl>,
## # `Self-care difficulty` <dbl>, `Independent living difficulty` <dbl>,
## # `No Disability` <dbl>
# df6_wide now has states as rows and the descriptive variable labels as columns
# Calculate percentages
df6_wide <- df6_wide %>%
mutate(DisabilityRate = `Total With Disabilities` / `Total_people` * 100)
# Plotting with ggplot2
library(ggplot2)
ggplot(df6_wide, aes(x = reorder(NAME, -DisabilityRate), y = DisabilityRate)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(x = "State", y = "Disability Rate (%)", title = "Disability Rates by State") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Select only the disability-related columns for correlation
disability_data <- df6_wide %>%
select(`Hearing`, `Vision difficulty`, `cognative`, `ambulatory difficulty`, `Self-care difficulty`, `Independent living difficulty`)
# Compute correlation matrix
correlation_matrix <- cor(disability_data, use = "complete.obs") # use = "complete.obs" handles missing values
# Check if corrplot is installed; if not, install it
if (!require(corrplot)) install.packages("corrplot")
library(corrplot)
corrplot(correlation_matrix, method = "circle")
# Comparative analysis of disabilities within a state (example: Iowa)
iowa_data <- df6_wide %>%
filter(NAME == "Iowa") %>%
select(`Hearing`, `Vision difficulty`, `cognative`, `ambulatory difficulty`, `Self-care difficulty`, `Independent living difficulty`)
# Plotting the data for Iowa
barplot(as.matrix(iowa_data), beside = TRUE, legend.text = TRUE, args.legend = list(x = "topright"))
# Predicting 'Ambulatory difficulty' based on other disabilities
lm_model <- lm(`ambulatory difficulty` ~ `Hearing` + `Vision difficulty` + `cognative` + `Self-care difficulty` + `Independent living difficulty`, data = df6_wide)
# Summary of the linear model
summary(lm_model)
##
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` +
## cognative + `Self-care difficulty` + `Independent living difficulty`,
## data = df6_wide)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56621 -11941 -2766 17524 73868
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8200.5856 5646.4977 -1.452 0.153197
## Hearing 0.6796 0.1793 3.791 0.000436 ***
## `Vision difficulty` 1.0070 0.1644 6.126 1.87e-07 ***
## cognative -0.5426 0.2291 -2.369 0.022116 *
## `Self-care difficulty` -1.9464 0.4434 -4.390 6.59e-05 ***
## `Independent living difficulty` 1.9228 0.3144 6.115 1.95e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24940 on 46 degrees of freedom
## Multiple R-squared: 0.9968, Adjusted R-squared: 0.9965
## F-statistic: 2909 on 5 and 46 DF, p-value: < 2.2e-16
lm_model
##
## Call:
## lm(formula = `ambulatory difficulty` ~ Hearing + `Vision difficulty` +
## cognative + `Self-care difficulty` + `Independent living difficulty`,
## data = df6_wide)
##
## Coefficients:
## (Intercept) Hearing
## -8200.5856 0.6796
## `Vision difficulty` cognative
## 1.0070 -0.5426
## `Self-care difficulty` `Independent living difficulty`
## -1.9464 1.9228
# K-means clustering
#Perform a clustering analysis to identify groups of states with similar disability profiles.
# K-means clustering with 3 centers
set.seed(123) # For reproducibility
clusters <- kmeans(disability_data, centers = 3)
# Add cluster assignment to the data
df6_wide$Cluster <- as.factor(clusters$cluster)
pca_res <- prcomp(disability_data, scale. = TRUE)
fviz_pca_biplot(pca_res, label = "none", col.ind = df6_wide$Cluster, addEllipses = TRUE)